# -*- coding: utf-8 -*-
import scrapy
import json
from gkzxprovince.items import GkzxprovinceItem

class ProvincespiderSpider(scrapy.Spider):
    """Scrape province-level gaokao admission-score records from the eol.cn API.

    Generates one JSON API request per (province, year) pair for the years
    2014-2019 and yields each record dict found under ``data.item`` in the
    response.
    """
    name = 'provinceSpider'
    # allowed_domains = ['api.eol.cn/gkcx/api/?access_token=']
    # URLs are generated in __init__ via gkzxprovinceUrl().
    start_urls = []

    # Minimal anti-scraping measure: browser-like headers plus the cookie the
    # API expects. NOTE(review): the hard-coded Cookie/IP values may expire —
    # confirm the API still accepts them.
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Cookie':
        'tool_ipuse=172.16.30.144; tool_ipprovince=99; ip=119.85.171.146; ipareaname=%E9%87%8D%E5%BA%86'
    }

    def __init__(self, *args, **kwargs):
        # BUG FIX: the original override never called super().__init__(),
        # which scrapy.Spider relies on to process the spider name and any
        # arguments passed via `scrapy crawl -a ...`.
        super().__init__(*args, **kwargs)
        self.start_urls = self.gkzxprovinceUrl()

    def start_requests(self):
        """Override Scrapy's default start to attach the anti-bot headers."""
        for url in self.start_urls:
            # Use the spider's logger instead of a bare print().
            self.logger.debug("url====%s", url)
            yield scrapy.Request(url=url, callback=self.parse, headers=self.headers)

    def parse(self, response):
        """Parse one JSON API response and yield each raw score record.

        The payload of interest is the list under ``data`` -> ``item``; each
        element is a dict (id, year, average, local_province_name, ...).
        """
        payload = json.loads(response.text)
        # NOTE: records are yielded as raw dicts; the GkzxprovinceItem wrapper
        # was constructed but never used in the original, so it was removed.
        for record in payload['data']['item']:
            yield record

    def gkzxprovinceUrl(self):
        """Build the list of API URLs: one per (province, year), 2014-2019.

        Returns:
            list[str]: fully-formed query URLs for the score endpoint.
        """
        # Province name -> numeric province_id used by the API.
        citys = {
            '北京':'11','天津':'12','河北':'13','山西':'14','内蒙':'15','辽宁':'21','吉林':'22','黑龙江':'23','上海':'31',
            '江苏': '32','浙江':'33','安徽':'34','福建':'35','江西':'36','山东':'37','河南':'41','湖北':'42','湖南':'43',
            '广东': '44','广西':'45','海南':'46','四川':'51','重庆':'50','贵州':'52','云南':'53','西藏':'54','陕西':'61',
            '甘肃': '62','青海':'63','宁夏':'64','新疆':'65'
        }
        # Query the years 2014 through 2019 inclusive. Only the codes are
        # needed, so iterate .values() rather than .items().
        return [
            f'https://api.eol.cn/gkcx/api/?access_token=&page=1&province_id={code}'
            f'&signsafe=&size=20&uri=apidata/api/gk/score/proprovince&year={year}'
            for year in range(2014, 2020)
            for code in citys.values()
        ]
if __name__ == '__main__':
    # Smoke-test URL generation without starting a crawl.
    gk = ProvincespiderSpider()
    # Renamed from `list`, which shadowed the builtin; also report the count
    # instead of silently discarding the result.
    urls = gk.gkzxprovinceUrl()
    print(f'generated {len(urls)} urls')
